{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 3 folds for each of 576 candidates, totalling 1728 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.5s\n",
      "[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   23.5s\n",
      "[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   57.2s\n",
      "[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.8min\n",
      "[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.9min\n",
      "[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed:  6.0min finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best score: 0.983\n",
      "Best parameters set:\n",
      "\tclf__C: 10\n",
      "\tclf__penalty: 'l2'\n",
      "\tvect__max_df: 0.25\n",
      "\tvect__max_features: 5000\n",
      "\tvect__ngram_range: (1, 2)\n",
      "\tvect__stop_words: None\n",
      "\tvect__use_idf: True\n",
      "Accuracy: 0.983488872936\n",
      "Precision: 0.99375\n",
      "Recall: 0.878453038674\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model.logistic import LogisticRegression\n",
    "from sklearn.grid_search import GridSearchCV\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import precision_score, recall_score, accuracy_score\n",
    "\n",
    "\n",
    "pipeline = Pipeline([\n",
    "    ('vect', TfidfVectorizer(stop_words='english')),\n",
    "    ('clf', LogisticRegression())\n",
    "])\n",
    "parameters = {\n",
    "    'vect__max_df': (0.25, 0.5, 0.75),\n",
    "    'vect__stop_words': ('english', None),\n",
    "    'vect__max_features': (2500, 5000, None),\n",
    "    'vect__ngram_range': ((1, 1), (1, 2)),\n",
    "    'vect__use_idf': (True, False),\n",
    "    'clf__penalty': ('l1', 'l2'),\n",
    "    'clf__C': (0.01, 0.1, 1, 10),\n",
    "}\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    df = pd.read_csv('./SMSSpamCollection', delimiter='\\t', header=None)\n",
    "    X = df[1].values\n",
    "    y = df[0].values\n",
    "    label_encoder = LabelEncoder()\n",
    "    y = label_encoder.fit_transform(y)\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
    "    \n",
    "    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy', cv=3)\n",
    "    grid_search.fit(X_train, y_train)\n",
    "    \n",
    "    print('Best score: %0.3f' % grid_search.best_score_)\n",
    "    print('Best parameters set:')\n",
    "    best_parameters = grid_search.best_estimator_.get_params()\n",
    "    for param_name in sorted(parameters.keys()):\n",
    "        print('\\t%s: %r' % (param_name, best_parameters[param_name]))\n",
    "        \n",
    "    predictions = grid_search.predict(X_test)\n",
    "    print('Accuracy: %s' % accuracy_score(y_test, predictions))\n",
    "    print('Precision: %s' % precision_score(y_test, predictions))\n",
    "    print('Recall: %s' % recall_score(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
